<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    
<meta charset="UTF-8">
<title>Aggregations and Analysis | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="docvalues-and-fielddata.html" title="Doc Values and Fielddata">
<link rel="prev" href="_deep_dive_on_doc_values.html" title="Deep Dive on Doc Values">
<link rel="next" href="_limiting_memory_usage.html" title="Limiting Memory Usage">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <script>dataLayer = [];</script><noscript><iframe src="//www.googletagmanager.com/ns.html?id=GTM-58RLH5" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= '//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      (function(){var g=function(e,h,f,g){
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="aggregations.html">Aggregations</a></span>
»
<span class="breadcrumb-link"><a href="docvalues-and-fielddata.html">Doc Values and Fielddata</a></span>
»
<span class="breadcrumb-node">Aggregations and Analysis</span>
</div>
<div class="navheader">
<span class="prev">
<a href="_deep_dive_on_doc_values.html">« Deep Dive on Doc Values</a>
</span>
<span class="next">
<a href="_limiting_memory_usage.html">Limiting Memory Usage »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="aggregations-and-analysis"></a>Aggregations and Analysis<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/300_Aggregations/95_analyzed_vs_not.asciidoc">edit</a>
</h2>
</div></div></div>
<p>Some aggregations, such as the <code class="literal">terms</code> bucket, operate on string fields.  And
string fields may be either <code class="literal">analyzed</code> or <code class="literal">not_analyzed</code>, which raises the question:
how does analysis affect aggregations?</p>
<p>The answer is "a lot," for two reasons: analysis affects the tokens used in the aggregation,
and doc values <em>do not work</em> with analyzed strings.</p>
<p>Let’s tackle the first problem: how the generation of analyzed tokens affects
aggregations. First, let’s index some documents representing various states in the US:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">POST /agg_analysis/data/_bulk
{ "index": {}}
{ "state" : "New York" }
{ "index": {}}
{ "state" : "New Jersey" }
{ "index": {}}
{ "state" : "New Mexico" }
{ "index": {}}
{ "state" : "New York" }
{ "index": {}}
{ "state" : "New York" }</pre>
</div>
<p>We want to build a list of unique states in our dataset, complete with counts.
Simple—​let’s use a <code class="literal">terms</code> bucket:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /agg_analysis/data/_search
{
    "size" : 0,
    "aggs" : {
        "states" : {
            "terms" : {
                "field" : "state"
            }
        }
    }
}</pre>
</div>
<p>This gives us these results:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">{
...
   "aggregations": {
      "states": {
         "buckets": [
            {
               "key": "new",
               "doc_count": 5
            },
            {
               "key": "york",
               "doc_count": 3
            },
            {
               "key": "jersey",
               "doc_count": 1
            },
            {
               "key": "mexico",
               "doc_count": 1
            }
         ]
      }
   }
}</pre>
</div>
<p>Oh dear, that’s not at all what we want!  Instead of counting states, the aggregation
is counting individual words.  The underlying reason is simple: aggregations
are built from the inverted index, and the inverted index is <em>post-analysis</em>.</p>
<p>When we added those documents to Elasticsearch, the string <code class="literal">"New York"</code> was
analyzed/tokenized into <code class="literal">["new", "york"]</code>.  These individual tokens were then
used to populate aggregation counts, and ultimately we see counts for <code class="literal">new</code> instead of
<code class="literal">New York</code>.</p>
<p>This is obviously not the behavior that we wanted, but luckily it is easily
corrected.</p>
<p>We need to define a multifield for <code class="literal">state</code> and set it to <code class="literal">not_analyzed</code>.  This
will prevent <code class="literal">New York</code> from being analyzed, which means it will stay a single
token in the aggregation.  Let’s try the whole process over, but this time
specify a <em>raw</em> multifield:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">DELETE /agg_analysis/
PUT /agg_analysis
{
  "mappings": {
    "data": {
      "properties": {
        "state" : {
          "type": "string",
          "fields": {
            "raw" : {
              "type": "string",
              "index": "not_analyzed"<a id="CO218-1"></a><i class="conum" data-value="1"></i>
            }
          }
        }
      }
    }
  }
}

POST /agg_analysis/data/_bulk
{ "index": {}}
{ "state" : "New York" }
{ "index": {}}
{ "state" : "New Jersey" }
{ "index": {}}
{ "state" : "New Mexico" }
{ "index": {}}
{ "state" : "New York" }
{ "index": {}}
{ "state" : "New York" }

GET /agg_analysis/data/_search
{
  "size" : 0,
  "aggs" : {
    "states" : {
        "terms" : {
            "field" : "state.raw" <a id="CO218-2"></a><i class="conum" data-value="2"></i>
        }
    }
  }
}</pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO218-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>This time we explicitly map the <code class="literal">state</code> field and include a <code class="literal">not_analyzed</code> sub-field.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO218-2"><i class="conum" data-value="2"></i></a></p>
</td>
<td align="left" valign="top">
<p>The aggregation is run on <code class="literal">state.raw</code> instead of <code class="literal">state</code>.</p>
</td>
</tr>
</table>
</div>
<p>Now when we run our aggregation, we get results that make sense:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">{
...
   "aggregations": {
      "states": {
         "buckets": [
            {
               "key": "New York",
               "doc_count": 3
            },
            {
               "key": "New Jersey",
               "doc_count": 1
            },
            {
               "key": "New Mexico",
               "doc_count": 1
            }
         ]
      }
   }
}</pre>
</div>
<p>In practice, this kind of problem is easy to spot.  Your aggregations
will simply return strange buckets, and you’ll remember the analysis issue.
It is a generalization, but there are not many instances where you want to use
an analyzed  field in an aggregation.  When in doubt, add a multifield so
you have the option for both.</p>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_analyzed_strings_and_fielddata"></a>Analyzed strings and Fielddata<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/300_Aggregations/95_analyzed_vs_not.asciidoc">edit</a>
</h3>
</div></div></div>
<p>While the first problem relates to how data is aggregated and displayed to your
user, the second problem is largely technical and behind the scenes.</p>
<p>Doc values do not support <code class="literal">analyzed</code> string fields because they are not very efficient
at representing multi-valued strings.  Doc values are most efficient
when each document has one or several tokens, but not thousands as in the case
of large, analyzed strings (imagine a PDF body, which may be several megabytes
and have thousands of unique tokens).</p>
<p>For that reason, doc values are not generated for <code class="literal">analyzed</code> strings.  Yet these fields
can still be used in aggregations.  How is that possible?</p>
<p>The answer is a data structure known as <em>fielddata</em>.  Unlike doc values, fielddata
is built and managed 100% in memory, living inside the JVM heap.  That means
it is inherently less scalable and has a lot of edge-cases to watch out for.
The rest of this chapter are addressing the challenges of fielddata in the context
of <code class="literal">analyzed</code> strings</p>
<div class="note admon">
<div class="icon"></div>
<div class="admon_content">
<p>Historically, fielddata was the default for <em>all</em> fields, but Elasticsearch
has been migrating towards doc values to reduce the chance of OOM.
Analyzed strings are the last holdout where fielddata is still used.  The goal is to
eventually build a serialized data structure similar to doc values which can handle
highly dimensional analyzed strings, obsoleting fielddata once and for all.</p>
</div>
</div>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_high_cardinality_memory_implications"></a>High-Cardinality Memory Implications<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/300_Aggregations/95_analyzed_vs_not.asciidoc">edit</a>
</h3>
</div></div></div>
<p>There is another reason to avoid aggregating analyzed fields: high-cardinality
fields consume a large amount of memory when loaded into fielddata.  The
analysis process often (although not always) generates a large number of tokens,
many of  which are unique.  This increases the overall cardinality of the field
and contributes to more memory pressure.</p>
<p>Some types of analysis are <em>extremely</em> unfriendly with regards to memory.
Consider an n-gram analysis process.  The term <code class="literal">New York</code> might be n-grammed into
the following tokens:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
<code class="literal">ne</code>
</li>
<li class="listitem">
<code class="literal">ew</code>
</li>
<li class="listitem">
<code class="literal">w </code>
</li>
<li class="listitem">
<code class="literal"> y</code>
</li>
<li class="listitem">
<code class="literal">yo</code>
</li>
<li class="listitem">
<code class="literal">or</code>
</li>
<li class="listitem">
<code class="literal">rk</code>
</li>
</ul>
</div>
<p>You can imagine how the n-gramming process creates a huge number of unique tokens,
especially when analyzing paragraphs of text.  When these are loaded into memory,
you can easily exhaust your heap space.</p>
<p>So, before aggregating string fields, assess the situation:</p>
<div class="ulist itemizedlist">
<ul class="itemizedlist">
<li class="listitem">
Is it a <code class="literal">not_analyzed</code> field?  If yes, the field will use doc values and be memory-friendly
</li>
<li class="listitem">
Otherwise, this is an <code class="literal">analyzed</code> field.  It will use fielddata and live in-memory.
Does this field have a very large cardinality caused by ngrams, shingles, etc? If yes,
it may be very memory unfriendly.
</li>
</ul>
</div>
</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="_deep_dive_on_doc_values.html">« Deep Dive on Doc Values</a>
</span>
<span class="next">
<a href="_limiting_memory_usage.html">Limiting Memory Usage »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  window.initial_state = {}</script>
  </body>
</html>
